library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
library(uwot)
## Loading required package: Matrix
library(gridExtra)
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
## Loading required package: rpart
##
## spatstat 1.64-1 (nickname: 'Help you I can, yes!')
## For an introduction to spatstat, type 'beginner'
##
## Note: R version 3.6.0 (2019-04-26) is more than a year old; we strongly recommend upgrading to the latest version
##
## Attaching package: 'spatstat'
## The following object is masked from 'package:scales':
##
## rescale
## The following object is masked from 'package:data.table':
##
## shift
library(ggdendro)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat
## ── Attaching packages ───────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::combine() masks gridExtra::combine()
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x tidyr::pack() masks Matrix::pack()
## x purrr::transpose() masks data.table::transpose()
## x tidyr::unpack() masks Matrix::unpack()
library(grid)
library(gridExtra) # add to AWS
library(cowplot) # add to AWS
library(slingshot) # add to AWS
## Loading required package: princurve
library(viridis) # add to AWS
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(RColorBrewer)
library(ggrepel)
use_condaenv(condaenv = 'r-reticulate', required = TRUE)
data_dir <- here::here('..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))
censor_negative_min = -1500
redo_umap_clustering = F
redo_umap_param_search = F
#only redo param search if doing clustering also:
redo_umap_clustering = redo_umap_clustering && redo_umap_clustering
# map day 0 to both plus and minus
map_day_0 <- function(df) {
return(rbind(
df[k562!='none'], #gets all cd19+-
#df[rep(keep_none, .N) & k562=='none' & day > 0],
df[k562=='none' & day == 0][, k562 := 'cd19+'], #none and day 0, assign as 19+
df[k562=='none' & day == 0][, k562 := 'cd19-'] #none and day 0, assign as 19-
))
}
'%notin%' <- Negate('%in%')
#filter diff_dt by cd4/8 type, cd19+ (map day 0), donor == 1 or 2
#cd4, cd19+, donor
diff_dt_cd4 <- diff_dt %>% map_day_0() %>% filter(t_type == 'cd4', k562=='cd19+', donor == 2)
#cd8, cd19+, donor
diff_dt_cd8 <- diff_dt %>% map_day_0() %>% filter(t_type == 'cd8', k562=='cd19+', donor == 2)
For now, skip straight to high dimensional plotting.
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map except cd4/8
umap_vars <- c(paste0(names(channel_map),'_t'))
# removing zombie, cd_t, myc_t, gfp_t
umap_vars <- umap_vars[!umap_vars %in% c('cd_t', 'zombie_t', 'myc_t', 'gfp_t')]
run_umap <- function(dt, chosen_dist, chosen_n_neighbor, sample_n, umap_vars,
chosen_learning_rate=0.1) {
umap_dt <- dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(sample_n, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
# for now use all variables in the channel map
#umap_vars <- c(paste0(names(channel_map),'_t'))
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- data.table(dcast(umap_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value'))
umap_dt_in <- umap_cast_dt[, ..umap_vars]
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
# run umap via uwot library
umap_out <- umap(umap_dt_in, min_dist=chosen_dist,
learning_rate=chosen_learning_rate,
n_sgd_threads=1,
n_neighbors=chosen_n_neighbor,
verbose=T, n_threads=8, n_trees=50,
ret_nn=T)
# nearest neighbors in original space
nearest_neighbors <- umap_out$nn$euclidean$idx
neighbor_dist <- umap_out$nn$euclidean$dist
edge_weights <- scales::rescale(-neighbor_dist, to=c(0,1))
edge_weights <- edge_weights - min(edge_weights)
umap_out <- data.table(umap_out$embedding)
#nearest neighbors in embedding
# umap_nn <- cbind(1:nrow(umap_fcs_dt),
# nnwhich(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
# umap_nd <- cbind(rep(0, nrow(umap_fcs_dt)),
# nndist(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
# umap_ew <- scales::rescale(-umap_nd, to=c(0,1))
# umap_ew <- umap_ew - min(umap_ew)
# rename the columns
names(umap_out) <- c('umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
# add back the annotations
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
source_python(here::here('py/leiden_clust.py'))
clust_membership <- leiden_clust(nearest_neighbors,
edge_weights=scales::rescale(-neighbor_dist, to=c(0,1)))
umap_fcs_dt[, cluster := factor(clust_membership+1)]
#renumber the clusters
cluster_ranks <- umap_fcs_dt[, list(mean_day=mean(day)), by=cluster][, rank_day := rank(mean_day)]
umap_fcs_dt <- umap_fcs_dt[cluster_ranks, on='cluster'][, c('cluster', 'rank_day') := list(rank_day, NULL)]
return(umap_fcs_dt)
}
Scaling this to 200 events per well and checking CAR/condition separation.
umap_cd4_fcs_dt <- run_umap(diff_dt_cd4, 0.005, 15, 200, umap_vars)
umap_cd8_fcs_dt <- run_umap(diff_dt_cd8, 0.005, 15, 200, umap_vars)
fwrite(umap_cd4_fcs_dt,
compress='gzip',
file=file.path(here::here('..','data','diff.sampled.umap_cd4.csv.gz')))
fwrite(umap_cd8_fcs_dt,
compress='gzip',
file=file.path(here::here('..','data','diff.sampled.umap_cd8.csv.gz')))
# sync output back to s3
system('aws s3 sync \\
--exclude "*" \\
--include "*.Rdata" \\
--include "*.csv*" \\
~/local-tcsl/data s3://roybal-tcsl//lenti_screen_compiled_data/data')
#read in cd8 umap data
umap_cd8_fcs_dt <- fread(
file.path(
here::here('..','data','diff.sampled.umap_cd8.csv.gz')))
umap_cd8_fcs_dt[, cluster := factor(cluster)]
umap_cd8_cluster_dt <- umap_cd8_fcs_dt[, list(
mean_umap_1= mean(umap_1),
mean_umap_2= mean(umap_2),
size= .N), by=cluster]
#read in cd4 umap data
umap_cd4_fcs_dt <- fread(
file.path(
here::here('..','data','diff.sampled.umap_cd4.csv.gz')))
umap_cd4_fcs_dt[, cluster := factor(cluster)]
umap_cd4_cluster_dt <- umap_cd4_fcs_dt[, list(
mean_umap_1= mean(umap_1),
mean_umap_2= mean(umap_2),
size= .N), by=cluster]
#plot umap
ggumap <- function(df, df_cluster) {
# whole plot
umap_whole <- ggplot() +
geom_point(data=df,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=df_cluster, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
theme_minimal() + ggtitle('UMAP by Cluster')
# individual plots
umap_individual <- ggplot() +
geom_point(data=df,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=df_cluster, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
facet_wrap(~cluster) +
theme_bw()
# UMAP by day
label_days_umap <- df %>% group_by(day) %>% select(umap_1, umap_2) %>% summarize_all(mean)
umap_by_day <- ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(day)))+geom_point(size=0.1,alpha = 0.5, position=position_jitter(h=0.15,w=0.15))+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=day), data=label_days_umap)+guides(colour=FALSE)+ggtitle('UMAP by Day') + scale_color_viridis(discrete=TRUE)+theme_minimal()
grid.arrange(umap_by_day, umap_whole, umap_individual, nrow=3)
}
#CD8
ggumap(umap_cd8_fcs_dt, umap_cd8_cluster_dt)
## Adding missing grouping variables: `day`
#CD4
ggumap(umap_cd4_fcs_dt, umap_cd4_cluster_dt)
## Adding missing grouping variables: `day`
color_plots <- function(umap_df) {
cd4_colors = brewer.pal('Greens', n=9)[c(2,4,6,8,9)]
cd8_colors = brewer.pal('Oranges', n=9)[c(2,4,6,8,9)]
color_time <- ggplot(umap_df[][, cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, color=interaction(day, t_type))) +
geom_point(size=0.1, alpha=0.5) +
facet_grid(car~cd19) +
scale_color_manual(values=c(cd4_colors, cd8_colors)) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3)))
color_density <- ggplot(umap_df, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
facet_grid(car~cd19) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
theme_bw()
color_markers <- ggplot(
melt(umap_df, measure.vars=umap_vars)[,
value.scaled := scale(value), by='variable'][,
cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, z=value.scaled)) +
stat_summary_hex(bins = 70) +
facet_grid(variable~cd19) +
scale_fill_distiller(palette='RdYlBu', limits=c(-3, 3), oob=scales::squish) +
theme_bw()
grid.arrange(color_time, color_density, color_markers, ncol=3)
}
#cd8
color_plots(umap_cd8_fcs_dt)
#cd4
color_plots(umap_cd4_fcs_dt)